# importing the Necessary libraries
library(dplyr)
#Extracting the current working directory
getwd()
## [1] "/Users/kodeboyina/Documents/Kent State/Sem2/BA/Group Project"
#Loading House_Prices csv data Import the data set into R
House_Prices <- read.csv("data/House_Prices.csv", header = TRUE, sep = ",", stringsAsFactors = TRUE)
Data Over View - Descriptive Analysis
Data Overview, Providing summary of the data set, including the number of observations and variables, the data types and ranges for each variable
#Observing the first 10 Observations
head(House_Prices, n=10L)
## LotArea OverallQual YearBuilt YearRemodAdd BsmtFinSF1 FullBath HalfBath
## 1 8450 7 2003 2003 706 2 1
## 2 9600 6 1976 1976 978 2 0
## 3 11250 7 2001 2002 486 2 1
## 4 9550 7 1915 1970 216 1 0
## 5 14260 8 2000 2000 655 2 1
## 6 14115 5 1993 1995 732 1 1
## 7 10084 8 2004 2005 1369 2 0
## 8 10382 7 1973 1973 859 2 1
## 9 6120 7 1931 1950 0 2 0
## 10 7420 5 1939 1950 851 1 0
## BedroomAbvGr TotRmsAbvGrd Fireplaces GarageArea YrSold SalePrice
## 1 3 8 0 548 2008 208500
## 2 3 6 1 460 2007 181500
## 3 3 6 1 608 2008 223500
## 4 3 7 1 642 2006 140000
## 5 4 9 1 836 2008 250000
## 6 1 5 0 480 2009 143000
## 7 3 7 1 636 2007 307000
## 8 3 7 2 484 2009 200000
## 9 2 8 2 468 2008 129900
## 10 2 5 2 205 2008 118000
#Shape of the data set
dim(House_Prices)
## [1] 900 13
#Printing the Structure of the data
str(House_Prices)
## 'data.frame': 900 obs. of 13 variables:
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd: int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr: int 3 3 3 3 4 1 3 3 2 2 ...
## $ TotRmsAbvGrd: int 8 6 6 7 9 5 7 7 8 5 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
#Summary of the House Prices dataset
summary(House_Prices)
## LotArea OverallQual YearBuilt YearRemodAdd BsmtFinSF1
## Min. : 1491 Min. : 1.00 Min. :1880 Min. :1950 Min. : 0
## 1st Qu.: 7585 1st Qu.: 5.00 1st Qu.:1954 1st Qu.:1968 1st Qu.: 0
## Median : 9442 Median : 6.00 Median :1973 Median :1994 Median : 384
## Mean : 10795 Mean : 6.14 Mean :1971 Mean :1985 Mean : 446
## 3rd Qu.: 11618 3rd Qu.: 7.00 3rd Qu.:2000 3rd Qu.:2004 3rd Qu.: 729
## Max. :215245 Max. :10.00 Max. :2010 Max. :2010 Max. :2260
## FullBath HalfBath BedroomAbvGr TotRmsAbvGrd Fireplaces
## Min. :0.00 Min. :0.000 Min. :0.00 Min. : 2.00 Min. :0.000
## 1st Qu.:1.00 1st Qu.:0.000 1st Qu.:2.00 1st Qu.: 5.00 1st Qu.:0.000
## Median :2.00 Median :0.000 Median :3.00 Median : 6.00 Median :1.000
## Mean :1.56 Mean :0.386 Mean :2.84 Mean : 6.48 Mean :0.628
## 3rd Qu.:2.00 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.: 7.00 3rd Qu.:1.000
## Max. :3.00 Max. :2.000 Max. :8.00 Max. :14.00 Max. :3.000
## GarageArea YrSold SalePrice
## Min. : 0 Min. :2006 Min. : 34900
## 1st Qu.: 336 1st Qu.:2007 1st Qu.:130000
## Median : 480 Median :2008 Median :163000
## Mean : 473 Mean :2008 Mean :183108
## 3rd Qu.: 576 3rd Qu.:2009 3rd Qu.:216878
## Max. :1390 Max. :2010 Max. :755000
library(skimr)
skim(House_Prices)
| Name | House_Prices |
| Number of rows | 900 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| LotArea | 0 | 1 | 10794.60 | 11942.21 | 1491 | 7585 | 9442 | 11618.2 | 215245 | ▇▁▁▁▁ |
| OverallQual | 0 | 1 | 6.14 | 1.38 | 1 | 5 | 6 | 7.0 | 10 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1 | 1971.45 | 30.01 | 1880 | 1954 | 1973 | 2000.0 | 2010 | ▁▂▃▆▇ |
| YearRemodAdd | 0 | 1 | 1985.33 | 20.34 | 1950 | 1968 | 1994 | 2004.0 | 2010 | ▅▂▂▃▇ |
| BsmtFinSF1 | 0 | 1 | 446.53 | 446.52 | 0 | 0 | 384 | 728.8 | 2260 | ▇▅▂▁▁ |
| FullBath | 0 | 1 | 1.56 | 0.56 | 0 | 1 | 2 | 2.0 | 3 | ▁▇▁▇▁ |
| HalfBath | 0 | 1 | 0.39 | 0.50 | 0 | 0 | 0 | 1.0 | 2 | ▇▁▅▁▁ |
| BedroomAbvGr | 0 | 1 | 2.84 | 0.82 | 0 | 2 | 3 | 3.0 | 8 | ▁▇▁▁▁ |
| TotRmsAbvGrd | 0 | 1 | 6.48 | 1.61 | 2 | 5 | 6 | 7.0 | 14 | ▂▇▇▁▁ |
| Fireplaces | 0 | 1 | 0.63 | 0.66 | 0 | 0 | 1 | 1.0 | 3 | ▇▇▁▂▁ |
| GarageArea | 0 | 1 | 472.61 | 208.85 | 0 | 336 | 480 | 576.0 | 1390 | ▂▇▃▁▁ |
| YrSold | 0 | 1 | 2007.84 | 1.32 | 2006 | 2007 | 2008 | 2009.0 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1 | 183107.92 | 81908.18 | 34900 | 130000 | 163000 | 216877.8 | 755000 | ▇▅▁▁▁ |
#Loading the data Explorer Library
library(DataExplorer)
## Plot basic description for House_Prices data
## View basic description for House_Prices data
introduce(House_Prices)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 900 13 0 13 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 900 11700 50408
#Checking for the missing values in the House Prices
missing_counts = colSums(is.na(House_Prices))
print(missing_counts)
## LotArea OverallQual YearBuilt YearRemodAdd BsmtFinSF1 FullBath
## 0 0 0 0 0 0
## HalfBath BedroomAbvGr TotRmsAbvGrd Fireplaces GarageArea YrSold
## 0 0 0 0 0 0
## SalePrice
## 0
#Plotting the percentage of missing values
plot_intro(House_Prices)
# Create a bar plot to visualize the missing values
barplot(missing_counts, main = "Null Values", xlab = "Variables", ylab = "Count")
# Printing the Variable types
variable_types <- sapply(House_Prices, class)
print(variable_types)
## LotArea OverallQual YearBuilt YearRemodAdd BsmtFinSF1 FullBath
## "integer" "integer" "integer" "integer" "integer" "integer"
## HalfBath BedroomAbvGr TotRmsAbvGrd Fireplaces GarageArea YrSold
## "integer" "integer" "integer" "integer" "integer" "integer"
## SalePrice
## "integer"
Based on the above observation all the data types are integers and there are no categorical variables in our data
#Descriptive Analysis of the Numeric Variables
numeric_vars <- c("LotArea", "OverallQual", "BsmtFinSF1", "FullBath", "HalfBath", "BedroomAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageArea", "SalePrice")
# Set the layout for the plot grid
par(mfrow = c(2, 3))
# Create box plots for each numerical variable
for (var in numeric_vars) {
boxplot(House_Prices[[var]], main = paste("Box Plot : ", var))
}
# Create histogram plots for each numerical variable
par(mfrow = c(2, 3))
# Set the layout for the plot grid
for (var in numeric_vars)
{
hist(House_Prices[[var]], main = paste("Histogram Plot : ", var), xlab = var)
}
library(corrplot)
# Create a correlation matrix
correlation_matrix <- cor(House_Prices)
# Specify custom colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)
# Plot a heatmap of the correlation matrix with custom color and title
corrplot(correlation_matrix,
method = "color", # Use color to represent correlation values
col = color_scheme, # Specify custom color scheme
title = "House Prices Correlation Heatmap", # Specify custom title
tl.cex = 0.8, # Adjust text size for column and row names
mar = c(2, 2, 1, 1) # Adjust margins (bottom, top, left, right)
)
# Compute correlation matrix
print(correlation_matrix)
## LotArea OverallQual YearBuilt YearRemodAdd BsmtFinSF1 FullBath
## LotArea 1.000000 0.09621 0.007639 0.012302 0.2070352 0.12855
## OverallQual 0.096209 1.00000 0.569225 0.547469 0.2273585 0.55071
## YearBuilt 0.007639 0.56922 1.000000 0.569604 0.2645981 0.46267
## YearRemodAdd 0.012302 0.54747 0.569604 1.000000 0.1322066 0.43500
## BsmtFinSF1 0.207035 0.22736 0.264598 0.132207 1.0000000 0.05284
## FullBath 0.128547 0.55071 0.462667 0.434997 0.0528409 1.00000
## HalfBath -0.002609 0.30429 0.275349 0.205962 -0.0030281 0.12918
## BedroomAbvGr 0.089578 0.11259 -0.046072 0.004014 -0.1160040 0.36402
## TotRmsAbvGrd 0.153195 0.45870 0.128530 0.238986 0.0592867 0.56632
## Fireplaces 0.265592 0.39349 0.164903 0.122247 0.2929777 0.22522
## GarageArea 0.152720 0.59817 0.496031 0.379742 0.2869558 0.41051
## YrSold -0.021080 -0.04878 0.008918 0.036270 -0.0007844 -0.02034
## SalePrice 0.264372 0.79621 0.526634 0.522177 0.4046632 0.55801
## HalfBath BedroomAbvGr TotRmsAbvGrd Fireplaces GarageArea
## LotArea -0.002609 0.089578 0.15320 0.26559 0.15272
## OverallQual 0.304286 0.112591 0.45870 0.39349 0.59817
## YearBuilt 0.275349 -0.046072 0.12853 0.16490 0.49603
## YearRemodAdd 0.205962 0.004014 0.23899 0.12225 0.37974
## BsmtFinSF1 -0.003028 -0.116004 0.05929 0.29298 0.28696
## FullBath 0.129185 0.364024 0.56632 0.22522 0.41051
## HalfBath 1.000000 0.203046 0.33171 0.21738 0.21842
## BedroomAbvGr 0.203046 1.000000 0.67145 0.07540 0.08123
## TotRmsAbvGrd 0.331714 0.671454 1.00000 0.31038 0.36196
## Fireplaces 0.217375 0.075402 0.31038 1.00000 0.26626
## GarageArea 0.218421 0.081228 0.36196 0.26626 1.00000
## YrSold -0.023044 -0.028930 -0.06891 -0.06196 -0.04385
## SalePrice 0.304740 0.164427 0.57736 0.46863 0.65604
## YrSold SalePrice
## LotArea -0.0210802 0.26437
## OverallQual -0.0487804 0.79621
## YearBuilt 0.0089179 0.52663
## YearRemodAdd 0.0362696 0.52218
## BsmtFinSF1 -0.0007844 0.40466
## FullBath -0.0203373 0.55801
## HalfBath -0.0230436 0.30474
## BedroomAbvGr -0.0289300 0.16443
## TotRmsAbvGrd -0.0689141 0.57736
## Fireplaces -0.0619571 0.46863
## GarageArea -0.0438451 0.65604
## YrSold 1.0000000 -0.04627
## SalePrice -0.0462718 1.00000
# Specify custom colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)
# Plot a heat map of the correlation matrix with custom color, title, and coefficients
corrplot(correlation_matrix,
method = "color", # Use color to represent correlation values
type = "upper", # Display only the upper triangle of the matrix
tl.col = "black", # Color of text for column and row names
tl.srt = 45, # Rotation angle of text
tl.cex = 0.8, # Text size for column and row names
tl.offset = 1, # Offset of text from the heat map
addCoef.col = "black", # Color of correlation coefficients
number.cex = 0.7, # Text size for correlation coefficients
number.digits = 2, # Number of digits for correlation coefficients
diag = FALSE, # Exclude diagonal elements
outline = TRUE # Display outline around each cell
)
The correlation matrix provides information about the relationships between different variables in your dataset. In the provided matrix, the values range from -1 to 1, where +1 indicates a perfect positive correlation,-1 indicates a perfect negative correlation, and 0 indicates no correlation.. The correlation matrix provides insights into the relationships between various features and the response variable, SalePrice. Notably, OverallQual exhibits a strong positive correlation of 0.79621 with SalePrice, indicating a significant influence on the home’s sale value. GarageArea (correlation of 0.65604) and TotRmsAbvGrd (correlation of 0.57736) also show notable positive correlations, suggesting their impact on the sale price. These high-correlation relationships highlight the importance of these features in predicting home sale prices, emphasizing their relevance for effective analysis and model construction.
library(MASS)
#Linear Model with All the Variables included - Full model with all predictor variables
model <- lm(House_Prices$SalePrice ~ House_Prices$LotArea+House_Prices$OverallQual+House_Prices$YearBuilt+House_Prices$YearRemodAdd+House_Prices$BsmtFinSF1+House_Prices$FullBath+House_Prices$HalfBath+House_Prices$BedroomAbvGr+House_Prices$TotRmsAbvGrd+House_Prices$Fireplaces+House_Prices$GarageArea+House_Prices$YrSold)
summary(model)
##
## Call:
## lm(formula = House_Prices$SalePrice ~ House_Prices$LotArea +
## House_Prices$OverallQual + House_Prices$YearBuilt + House_Prices$YearRemodAdd +
## House_Prices$BsmtFinSF1 + House_Prices$FullBath + House_Prices$HalfBath +
## House_Prices$BedroomAbvGr + House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces +
## House_Prices$GarageArea + House_Prices$YrSold)
##
## Residuals:
## Min 1Q Median 3Q Max
## -286336 -20369 -2819 16607 349565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.38e+06 1.85e+06 -0.75 0.456
## House_Prices$LotArea 7.11e-01 1.08e-01 6.59 7.7e-11 ***
## House_Prices$OverallQual 2.30e+04 1.42e+03 16.21 < 2e-16 ***
## House_Prices$YearBuilt 1.29e+02 6.08e+01 2.13 0.034 *
## House_Prices$YearRemodAdd 3.86e+02 7.84e+01 4.92 1.0e-06 ***
## House_Prices$BsmtFinSF1 3.10e+01 3.07e+00 10.10 < 2e-16 ***
## House_Prices$FullBath 5.88e+03 3.24e+03 1.82 0.069 .
## House_Prices$HalfBath 3.05e+03 2.79e+03 1.09 0.274
## House_Prices$BedroomAbvGr -1.14e+04 2.16e+03 -5.26 1.8e-07 ***
## House_Prices$TotRmsAbvGrd 1.59e+04 1.34e+03 11.84 < 2e-16 ***
## House_Prices$Fireplaces 9.58e+03 2.17e+03 4.42 1.1e-05 ***
## House_Prices$GarageArea 6.11e+01 7.72e+00 7.91 7.6e-15 ***
## House_Prices$YrSold 1.30e+02 9.22e+02 0.14 0.887
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36300 on 887 degrees of freedom
## Multiple R-squared: 0.807, Adjusted R-squared: 0.804
## F-statistic: 308 on 12 and 887 DF, p-value: <2e-16
step_model <- stepAIC(model, direction = "both") # Stepwise regression with both forward and backward
## Start: AIC=18910
## House_Prices$SalePrice ~ House_Prices$LotArea + House_Prices$OverallQual +
## House_Prices$YearBuilt + House_Prices$YearRemodAdd + House_Prices$BsmtFinSF1 +
## House_Prices$FullBath + House_Prices$HalfBath + House_Prices$BedroomAbvGr +
## House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces + House_Prices$GarageArea +
## House_Prices$YrSold
##
## Df Sum of Sq RSS AIC
## - House_Prices$YrSold 1 2.64e+07 1.17e+12 18908
## - House_Prices$HalfBath 1 1.57e+09 1.17e+12 18910
## <none> 1.17e+12 18910
## - House_Prices$FullBath 1 4.35e+09 1.17e+12 18912
## - House_Prices$YearBuilt 1 5.95e+09 1.17e+12 18913
## - House_Prices$Fireplaces 1 2.56e+10 1.19e+12 18928
## - House_Prices$YearRemodAdd 1 3.18e+10 1.20e+12 18933
## - House_Prices$BedroomAbvGr 1 3.64e+10 1.20e+12 18936
## - House_Prices$LotArea 1 5.70e+10 1.22e+12 18951
## - House_Prices$GarageArea 1 8.23e+10 1.25e+12 18970
## - House_Prices$BsmtFinSF1 1 1.34e+11 1.30e+12 19006
## - House_Prices$TotRmsAbvGrd 1 1.85e+11 1.35e+12 19041
## - House_Prices$OverallQual 1 3.46e+11 1.51e+12 19142
##
## Step: AIC=18908
## House_Prices$SalePrice ~ House_Prices$LotArea + House_Prices$OverallQual +
## House_Prices$YearBuilt + House_Prices$YearRemodAdd + House_Prices$BsmtFinSF1 +
## House_Prices$FullBath + House_Prices$HalfBath + House_Prices$BedroomAbvGr +
## House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces + House_Prices$GarageArea
##
## Df Sum of Sq RSS AIC
## - House_Prices$HalfBath 1 1.58e+09 1.17e+12 18908
## <none> 1.17e+12 18908
## - House_Prices$FullBath 1 4.36e+09 1.17e+12 18910
## + House_Prices$YrSold 1 2.64e+07 1.17e+12 18910
## - House_Prices$YearBuilt 1 5.96e+09 1.17e+12 18911
## - House_Prices$Fireplaces 1 2.56e+10 1.19e+12 18926
## - House_Prices$YearRemodAdd 1 3.21e+10 1.20e+12 18931
## - House_Prices$BedroomAbvGr 1 3.64e+10 1.20e+12 18934
## - House_Prices$LotArea 1 5.70e+10 1.22e+12 18949
## - House_Prices$GarageArea 1 8.23e+10 1.25e+12 18968
## - House_Prices$BsmtFinSF1 1 1.34e+11 1.30e+12 19005
## - House_Prices$TotRmsAbvGrd 1 1.85e+11 1.35e+12 19039
## - House_Prices$OverallQual 1 3.46e+11 1.51e+12 19140
##
## Step: AIC=18908
## House_Prices$SalePrice ~ House_Prices$LotArea + House_Prices$OverallQual +
## House_Prices$YearBuilt + House_Prices$YearRemodAdd + House_Prices$BsmtFinSF1 +
## House_Prices$FullBath + House_Prices$BedroomAbvGr + House_Prices$TotRmsAbvGrd +
## House_Prices$Fireplaces + House_Prices$GarageArea
##
## Df Sum of Sq RSS AIC
## <none> 1.17e+12 18908
## - House_Prices$FullBath 1 3.34e+09 1.17e+12 18908
## + House_Prices$HalfBath 1 1.58e+09 1.17e+12 18908
## + House_Prices$YrSold 1 2.86e+07 1.17e+12 18910
## - House_Prices$YearBuilt 1 8.14e+09 1.18e+12 18912
## - House_Prices$Fireplaces 1 2.77e+10 1.20e+12 18927
## - House_Prices$YearRemodAdd 1 3.24e+10 1.20e+12 18930
## - House_Prices$BedroomAbvGr 1 3.57e+10 1.20e+12 18933
## - House_Prices$LotArea 1 5.64e+10 1.22e+12 18948
## - House_Prices$GarageArea 1 8.23e+10 1.25e+12 18967
## - House_Prices$BsmtFinSF1 1 1.33e+11 1.30e+12 19003
## - House_Prices$TotRmsAbvGrd 1 2.03e+11 1.37e+12 19050
## - House_Prices$OverallQual 1 3.51e+11 1.52e+12 19142
# Print the summary of the final stepwise regression model
summary(step_model)
##
## Call:
## lm(formula = House_Prices$SalePrice ~ House_Prices$LotArea +
## House_Prices$OverallQual + House_Prices$YearBuilt + House_Prices$YearRemodAdd +
## House_Prices$BsmtFinSF1 + House_Prices$FullBath + House_Prices$BedroomAbvGr +
## House_Prices$TotRmsAbvGrd + House_Prices$Fireplaces + House_Prices$GarageArea)
##
## Residuals:
## Min 1Q Median 3Q Max
## -284907 -20317 -2692 16283 350668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.15e+06 1.57e+05 -7.34 4.7e-13 ***
## House_Prices$LotArea 7.06e-01 1.08e-01 6.55 9.8e-11 ***
## House_Prices$OverallQual 2.31e+04 1.41e+03 16.34 < 2e-16 ***
## House_Prices$YearBuilt 1.46e+02 5.88e+01 2.49 0.013 *
## House_Prices$YearRemodAdd 3.88e+02 7.82e+01 4.97 8.2e-07 ***
## House_Prices$BsmtFinSF1 3.05e+01 3.04e+00 10.06 < 2e-16 ***
## House_Prices$FullBath 4.98e+03 3.12e+03 1.59 0.111
## House_Prices$BedroomAbvGr -1.12e+04 2.15e+03 -5.21 2.3e-07 ***
## House_Prices$TotRmsAbvGrd 1.62e+04 1.30e+03 12.43 < 2e-16 ***
## House_Prices$Fireplaces 9.88e+03 2.15e+03 4.60 4.9e-06 ***
## House_Prices$GarageArea 6.10e+01 7.71e+00 7.91 7.5e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36200 on 889 degrees of freedom
## Multiple R-squared: 0.806, Adjusted R-squared: 0.804
## F-statistic: 370 on 10 and 889 DF, p-value: <2e-16
##Several predictor variables (e.g., LotArea, OverallQual, BsmtFinSF1, etc.) have significant coefficients.
##The Multiple R-squared value suggests that the model explains a substantial proportion of the variance in the dependent variable.
# Fit the ANOVA model for all the variables
model <- aov(House_Prices$SalePrice ~ House_Prices$LotArea+House_Prices$OverallQual+House_Prices$YearBuilt+House_Prices$YearRemodAdd+House_Prices$BsmtFinSF1+House_Prices$FullBath+House_Prices$HalfBath+House_Prices$BedroomAbvGr+House_Prices$TotRmsAbvGrd+House_Prices$Fireplaces+House_Prices$GarageArea+House_Prices$YrSold)
# Perform ANOVA analysis
anova_result <- anova(model)
# View the ANOVA table
print(anova_result)
## Analysis of Variance Table
##
## Response: House_Prices$SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## House_Prices$LotArea 1 4.22e+11 4.22e+11 320.53 < 2e-16 ***
## House_Prices$OverallQual 1 3.62e+12 3.62e+12 2750.00 < 2e-16 ***
## House_Prices$YearBuilt 1 6.07e+10 6.07e+10 46.15 2.0e-11 ***
## House_Prices$YearRemodAdd 1 3.93e+10 3.93e+10 29.92 5.9e-08 ***
## House_Prices$BsmtFinSF1 1 2.10e+11 2.10e+11 159.64 < 2e-16 ***
## House_Prices$FullBath 1 9.75e+10 9.75e+10 74.14 < 2e-16 ***
## House_Prices$HalfBath 1 4.97e+10 4.97e+10 37.79 1.2e-09 ***
## House_Prices$BedroomAbvGr 1 8.36e+09 8.36e+09 6.35 0.012 *
## House_Prices$TotRmsAbvGrd 1 2.56e+11 2.56e+11 194.43 < 2e-16 ***
## House_Prices$Fireplaces 1 2.30e+10 2.30e+10 17.49 3.2e-05 ***
## House_Prices$GarageArea 1 8.23e+10 8.23e+10 62.56 7.7e-15 ***
## House_Prices$YrSold 1 2.64e+07 2.64e+07 0.02 0.887
## Residuals 887 1.17e+12 1.32e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
House_Prices - OverallQual: The overall quality of the house is highly significant (p-value < 2e-16) and has a substantial effect on the sale price.
Other predictors like LotArea, BsmtFinSF1, FullBath, TotRmsAbvGrd, and GarageArea are also highly significant.
Year Built and YearRemodAdd are moderately significant predictors with p-values of 2.006e-11 and 5.864e-08, respectively.
#Evaluating the Test data set with all the features selected from the data set
raw_data_test_all <- read.csv("data/Predict.csv")
# Assuming 'model' is your trained linear regression model
predictions_all <- predict(model, newdata = raw_data_test_all)
# Assuming 'actual_values' is the column in 'test_data' containing the actual SalePrice values
actual_values <- raw_data_test_all$SalePrice
# Calculate evaluation metrics
mse <- mean((predictions_all - actual_values)^2)
rmse <- sqrt(mse)
mae <- mean(abs(predictions_all - actual_values))
r_squared <- 1 - (sum((actual_values - predictions_all)^2) / sum((actual_values - mean(actual_values))^2))
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 9.622e+09
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 98093
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 78105
cat("R-squared:", r_squared, "\n")
## R-squared: -24.98
The large values for MSE, RMSE, and MAE suggest that the model’s predictions have substantial errors, and there is a considerable difference between predicted and actual values. The negative R-squared is concerning and implies that the model is not explaining the variance in Sale Prices. This could be due to over fitting, model misspecification, or the presence of outlines
End of the Analysis report : Now Let us consider construction of the model and before that we are converting the data for the years built,years modifie to Age of the Property and Age of Modifications also combining the bathrooms data in to a single column - Total Bathrooms and Also Normalizing the data set
## let construct the Regression model by considering the above assumptions
# A) Build a regression and decision tree model that can accurately predict the price of a house based on several predictors (you select appropriate features) for predicting the data of the house prices we need to consider the Training data and then analyse the data
#Loading the data sets
setwd("/Users/kodeboyina/Documents/Kent State/Sem2/BA/Group Project")
#Training Data set
raw_data <- read.csv("data/House_Prices.csv")
#From the above data set there are no missing values
#LotArea: Lot size in square feet
#BsmtFinSF1: Finished square feet
#GarageArea: Size of garage in square feet
#SalePrice: The sale price of the property.
#Having the outliers and we are normalizing the data for Scale Consistency,Equal Weight for Features
# Specify the variables to normalize
variables_to_normalize <- c("LotArea", "BsmtFinSF1", "GarageArea")
# Extract the selected variables
data_to_normalize <- raw_data[, variables_to_normalize]
# Calculate mean and standard deviation for normalization
means <- colMeans(data_to_normalize)
std_devs <- apply(data_to_normalize, 2, sd)
# Z-score normalization
normalized_data <- scale(data_to_normalize, center = means, scale = std_devs)
# Add the normalized variables back to the original data
raw_data[, variables_to_normalize] <- normalized_data
#Converting the baths information to Half and Full baths
raw_data$ConvertedHalfBath <- ifelse(raw_data$HalfBath == 0, 0,
ifelse(raw_data$HalfBath == 1, 0.5,
ifelse(raw_data$HalfBath == 2, 1, NA)))
# Calculate ages from the YrSold to find the property of the age
raw_data$AgeBuilt <- raw_data$YrSold - raw_data$YearBuilt
raw_data$AgeRemodAdd <- raw_data$YrSold - raw_data$YearRemodAdd
raw_data$TotalBathrooms = raw_data$FullBath + raw_data$ConvertedHalfBath
library(corrplot)
# Assuming 'raw_data' is your data frame
House_num <- raw_data %>%
dplyr::select(LotArea, OverallQual, BsmtFinSF1, BedroomAbvGr, TotRmsAbvGrd,
Fireplaces, GarageArea, SalePrice, AgeBuilt, AgeRemodAdd, TotalBathrooms)
# Create a correlation matrix
correlation_matrix <- cor(House_num)
# Specify custom colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)
# Plot a heat map of the correlation matrix with custom color and title
corrplot(correlation_matrix,
method = "color", # Use color to represent correlation values
col = color_scheme, # Specify custom color scheme
title = "House Prices Correlation Heatmap", # Specify custom title
tl.cex = 0.8, # Adjust text size for column and row names
mar = c(2, 2, 1, 1) # Adjust margins (bottom, top, left, right)
)
# Compute correlation matrix
print(correlation_matrix)
## LotArea OverallQual BsmtFinSF1 BedroomAbvGr TotRmsAbvGrd
## LotArea 1.000000 0.09621 0.20704 0.089578 0.15320
## OverallQual 0.096209 1.00000 0.22736 0.112591 0.45870
## BsmtFinSF1 0.207035 0.22736 1.00000 -0.116004 0.05929
## BedroomAbvGr 0.089578 0.11259 -0.11600 1.000000 0.67145
## TotRmsAbvGrd 0.153195 0.45870 0.05929 0.671454 1.00000
## Fireplaces 0.265592 0.39349 0.29298 0.075402 0.31038
## GarageArea 0.152720 0.59817 0.28696 0.081228 0.36196
## SalePrice 0.264372 0.79621 0.40466 0.164427 0.57736
## AgeBuilt -0.008562 -0.57104 -0.26448 0.044773 -0.13149
## AgeRemodAdd -0.013675 -0.55077 -0.13229 -0.005895 -0.24352
## TotalBathrooms 0.110986 0.59879 0.04486 0.396552 0.62311
## Fireplaces GarageArea SalePrice AgeBuilt AgeRemodAdd
## LotArea 0.2656 0.15272 0.2644 -0.008562 -0.013675
## OverallQual 0.3935 0.59817 0.7962 -0.571043 -0.550774
## BsmtFinSF1 0.2930 0.28696 0.4047 -0.264480 -0.132290
## BedroomAbvGr 0.0754 0.08123 0.1644 0.044773 -0.005895
## TotRmsAbvGrd 0.3104 0.36196 0.5774 -0.131488 -0.243523
## Fireplaces 1.0000 0.26626 0.4686 -0.167534 -0.126303
## GarageArea 0.2663 1.00000 0.6560 -0.497674 -0.382685
## SalePrice 0.4686 0.65604 1.0000 -0.528367 -0.525312
## AgeBuilt -0.1675 -0.49767 -0.5284 1.000000 0.570100
## AgeRemodAdd -0.1263 -0.38268 -0.5253 0.570100 1.000000
## TotalBathrooms 0.2812 0.44306 0.6053 -0.511646 -0.461382
## TotalBathrooms
## LotArea 0.11099
## OverallQual 0.59879
## BsmtFinSF1 0.04486
## BedroomAbvGr 0.39655
## TotRmsAbvGrd 0.62311
## Fireplaces 0.28121
## GarageArea 0.44306
## SalePrice 0.60533
## AgeBuilt -0.51165
## AgeRemodAdd -0.46138
## TotalBathrooms 1.00000
# Specify custom colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(20)
# Plot a heatmap of the correlation matrix with custom color, title, and coefficients
corrplot(correlation_matrix,
method = "color", # Use color to represent correlation values
type = "upper", # Display only the upper triangle of the matrix
tl.col = "black", # Color of text for column and row names
tl.srt = 45, # Rotation angle of text
tl.cex = 0.8, # Text size for column and row names
tl.offset = 1, # Offset of text from the heatmap
addCoef.col = "black", # Color of correlation coefficients
number.cex = 0.7, # Text size for correlation coefficients
number.digits = 2, # Number of digits for correlation coefficients
diag = FALSE, # Exclude diagonal elements
outline = TRUE # Display outline around each cell
)
Strong Positive Correlations with SalePrice:.
OverallQual (0.796): This variable has the highest positive
correlation with SalePrice. The overall quality of the house, as rated
by a numeric scale, is a strong predictor of the sale
price..
Moderate Positive Correlations with SalePrice:. GarageArea (0.656): The size of the garage has a moderate positive correlation with SalePrice.. TotalBathrooms (0.605): The total number of bathrooms shows a moderate positive correlation with SalePrice.. TotRmsAbvGrd (0.577): The total rooms above ground also has a moderate positive correlation with SalePrice.. Fireplaces (0.469): The number of fireplaces in the house has a moderate positive correlation with SalePrice..
Negative Correlations with SalePrice:. AgeBuilt (-0.528): The age of the house (how many years it has been since it was built) has a moderate negative correlation with SalePrice.. AgeRemodAdd (-0.525): The age since the last remodel also has a moderate negative correlation with SalePrice..
# Assuming 'data' is your data frame with independent variables
# Load the 'car' package
library(car)
vif_model <- lm(SalePrice ~ OverallQual +BsmtFinSF1+ GarageArea + BedroomAbvGr + TotRmsAbvGrd + Fireplaces + AgeBuilt + AgeRemodAdd + TotalBathrooms, data = House_num)
vif_values <- vif(vif_model)
print(vif_values)
## OverallQual BsmtFinSF1 GarageArea BedroomAbvGr TotRmsAbvGrd
## 2.606 1.249 1.761 2.121 3.171
## Fireplaces AgeBuilt AgeRemodAdd TotalBathrooms
## 1.327 2.254 1.732 2.603
Variance Inflation Factors (VIF) for different predictor variables in a linear regression model variance of an estimated regression coefficient increases if your predictors are correlated.. Here we can see that VIF vales are less than 5 are considered acceptable, indicating a low to moderate level of multicollinearity
#Using pairs to calculate multi-collinearity
# Assuming 'House_num' is your data frame
# Specify the columns you want to include in the pairs plot
# Specify Columns to Plot
columns_to_plot <- c("OverallQual", "BsmtFinSF1", "GarageArea", "BedroomAbvGr", "TotRmsAbvGrd", "TotalBathrooms","Fireplaces", "AgeBuilt", "AgeRemodAdd")
# Plot the Pairs
pairs(House_num[, columns_to_plot])
All the values have a VIF values below 5 are often considered
acceptable and there is no significant multi-collinearity in the data.
Here we can see that the there is linear relation ship between the
Number of Bedrooms above the ground and Total Rooms above ground so in
order to remove multi-collinearity we are removing the variable bedrooms
above ground for better prediction over test data
# Create Regression Model with the
reg_model <- lm(SalePrice ~ OverallQual+BsmtFinSF1+GarageArea+TotRmsAbvGrd+TotalBathrooms+Fireplaces+AgeBuilt+AgeRemodAdd, data = House_num)
summary(reg_model)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + BsmtFinSF1 + GarageArea +
## TotRmsAbvGrd + TotalBathrooms + Fireplaces + AgeBuilt + AgeRemodAdd,
## data = House_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -263084 -22065 -2886 16510 344730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -42491.6 9930.6 -4.28 2.1e-05 ***
## OverallQual 23592.9 1449.6 16.28 < 2e-16 ***
## BsmtFinSF1 15936.3 1393.3 11.44 < 2e-16 ***
## GarageArea 14355.2 1657.5 8.66 < 2e-16 ***
## TotRmsAbvGrd 11747.6 1097.5 10.70 < 2e-16 ***
## TotalBathrooms 5293.0 3149.4 1.68 0.093 .
## Fireplaces 13219.6 2180.6 6.06 2.0e-09 ***
## AgeBuilt -90.2 62.6 -1.44 0.150
## AgeRemodAdd -427.5 80.6 -5.31 1.4e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37600 on 891 degrees of freedom
## Multiple R-squared: 0.792, Adjusted R-squared: 0.79
## F-statistic: 423 on 8 and 891 DF, p-value: <2e-16
anova(reg_model)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## OverallQual 1 3.82e+12 3.82e+12 2709.2 < 2e-16 ***
## BsmtFinSF1 1 3.18e+11 3.18e+11 225.4 < 2e-16 ***
## GarageArea 1 2.03e+11 2.03e+11 143.7 < 2e-16 ***
## TotRmsAbvGrd 1 3.14e+11 3.14e+11 222.6 < 2e-16 ***
## TotalBathrooms 1 2.18e+10 2.18e+10 15.5 9.1e-05 ***
## Fireplaces 1 3.90e+10 3.90e+10 27.7 1.8e-07 ***
## AgeBuilt 1 1.46e+10 1.46e+10 10.3 0.0014 **
## AgeRemodAdd 1 3.97e+10 3.97e+10 28.2 1.4e-07 ***
## Residuals 891 1.26e+12 1.41e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The coefficients for each variable have associated p-values (Pr(>|t|)). These p-values indicate whether each predictor variable is statistically significant in predicting the response variable. In this case, all predictor variables have very small p-values (<< 0.05), suggesting they are statistically significant . The F-statistic tests the overall significance of the model. The associated p-value (Pr(>F)) is extremely small (less than 2e-16), indicating that the model is statistically significant..
Multiple R-squared (0.791) represents the proportion of variance in the response variable (Sale Price) that is explained by the predictor variables. Adjusted R-squared (0.8789) adjusts for the number of predictor variables.
The model seems to have a good fit (high R-squared value), and the individual predictor variables appear to be statistically significant in predicting Sale Price.
Post Analysis of the model
# Residual Analysis
residuals <- residuals(reg_model)
# Plot residuals vs. fitted values
plot(reg_model$fitted.values, residuals, main="Residuals vs Fitted", xlab="Fitted values", ylab="Residuals")
abline(h=0, col="red", lty=2)
# Plot a histogram of residuals
hist(residuals, main="Histogram of Residuals", xlab="Residuals")
# Check for Normality of Residuals
# Q-Q plot of standardized residuals
qqnorm(residuals,col="red")
qqline(residuals,col="red")
The plot of residuals vs. fitted values does not exhibit any
clear patterns, and the residuals appear to be randomly scattered around
the horizontal axis. This suggests that the model is appropriately
capturing the relationship between the predictors and the response
variable.. The Q-Q plot of standardized residuals shows
that the points closely follow the diagonal line. This suggests that the
residuals are approximately normally distributed, which is a positive
indication for the normality assumption. There are no
clear trends, U-shapes, or other systematic patterns in the residuals.
This further supports the idea that the model is capturing the
underlying patterns in the data..
Testing the data against the predict data and performing similar operations on the data
raw_data_test <- read.csv("data/Predict.csv")
# Specify the variables to normalize
variables_to_normalize_test <- c("LotArea", "BsmtFinSF1", "GarageArea")
# Extract the selected variables
data_to_normalize_test <- raw_data_test[, variables_to_normalize_test]
# Calculate mean and standard deviation for normalization
means <- colMeans(data_to_normalize_test)
std_devs <- apply(data_to_normalize_test, 2, sd)
# Z-score normalization
normalized_data <- scale(data_to_normalize_test, center = means, scale = std_devs)
# Add the normalized variables back to the original data
raw_data_test[, variables_to_normalize_test] <- normalized_data
#Converting the baths information to Half and Full baths
raw_data_test$ConvertedHalfBath <- ifelse(raw_data_test$HalfBath == 0, 0,
ifelse(raw_data_test$HalfBath == 1, 0.5,
ifelse(raw_data_test$HalfBath == 2, 1, NA)))
# Calculate ages from the YrSold to find the properity of the age
raw_data_test$AgeBuilt <- raw_data_test$YrSold - raw_data_test$YearBuilt
raw_data_test$AgeRemodAdd <- raw_data_test$YrSold - raw_data_test$YearRemodAdd
raw_data_test$TotalBathrooms = raw_data_test$FullBath + raw_data_test$ConvertedHalfBath
#test regression model
predictions <- predict(reg_model, newdata = raw_data_test)
# Assuming 'actual_values' is the column in 'test_data' containing the actual SalePrice values
actual_values <- raw_data_test$SalePrice
# Calculate evaluation metrics
mse <- mean((predictions - actual_values)^2)
rmse <- sqrt(mse)
mae <- mean(abs(predictions - actual_values))
r_squared <- 1 - (sum((actual_values - predictions)^2) / sum((actual_values - mean(actual_values))^2))
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 842614992
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 29028
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 22254
cat("R-squared:", r_squared, "\n")
## R-squared: 0.7725
# Optionally, you can also visualize the predictions vs. actual values
plot(predictions, actual_values, main="Predicted vs Actual", xlab="Predicted", ylab="Actual")
abline(a = 0, b = 1, col = "red", lty = 2) # Add a diagonal line for reference
A comprehensive evaluation of our linear regression model for
predicting Sales Prices using both quantitative metrics and visual
examination. The model was trained on a dataset of 900 observations, and
its performance was assessed on a separate test dataset of 90
observations..
Evaluation Metrics: Mean Squared Error (MSE): The model achieved a Mean Squared Error of 8.4e+08 signifying the average squared difference between predicted and actual Sale Prices. Lower MSE values are indicative of better predictive accuracy..
Root Mean Squared Error (RMSE): With an RMSE of 28,983, our model’s predictions, on average, deviate by approximately $29,028 from the actual Sale Prices. A lower RMSE suggests improved accuracy..
Mean Absolute Error (MAE): The Mean Absolute Error is 22,431, reflecting the average absolute difference between predicted and actual Sale Prices. This metric is useful for understanding the average magnitude of prediction errors..
R-squared: The R-squared value of 0.7725 indicates that our model explains approximately 77.25% of the variance in Sale Prices. A higher R-squared suggests a better fit to the data..
Predicted vs. Actual Values Plot: A visual inspection of the predicted vs. actual values plot further supports the model’s effectiveness. The plot exhibits a clear linear relationship, indicating that the model’s predictions align closely with the actual Sale Prices. The consistency in the alignment across the range of observations suggests that our linear regression model is capturing the underlying patterns in the data..
Descision tree Model
#Decision Tree Model with out pruning the data
library(rpart)
# Build the decision tree model
tree_model <- rpart(SalePrice ~ LotArea+OverallQual+BsmtFinSF1+TotRmsAbvGrd+Fireplaces+GarageArea+AgeRemodAdd, data = House_num, method = 'anova' )
# Display the complexity parameter table
printcp(tree_model)
##
## Regression tree:
## rpart(formula = SalePrice ~ LotArea + OverallQual + BsmtFinSF1 +
## TotRmsAbvGrd + Fireplaces + GarageArea + AgeRemodAdd, data = House_num,
## method = "anova")
##
## Variables actually used in tree construction:
## [1] BsmtFinSF1 GarageArea OverallQual TotRmsAbvGrd
##
## Root node error: 6e+12/900 = 6.7e+09
##
## n= 900
##
## CP nsplit rel error xerror xstd
## 1 0.478 0 1.00 1.00 0.090
## 2 0.116 1 0.52 0.52 0.047
## 3 0.058 2 0.41 0.41 0.044
## 4 0.028 3 0.35 0.36 0.036
## 5 0.020 4 0.32 0.35 0.038
## 6 0.018 5 0.30 0.35 0.038
## 7 0.014 6 0.28 0.34 0.035
## 8 0.011 7 0.27 0.32 0.031
## 9 0.010 8 0.26 0.31 0.031
plotcp(tree_model)
As we go down the table, CP decreases, indicating less complex
trees. However, rel error and xerror also increase, meaning the
predictions become less accurate.At nsplit = 7, the relative error
decreases but the cross validation error increases indicates the point
at which the data overfits
#Plotting the descision tree model
library(rattle)
fancyRpartPlot(tree_model)
# Build the decision tree model with pruning and a minimum split of 60
tree_model_pruned <- rpart(
SalePrice ~ LotArea + OverallQual + BsmtFinSF1 + TotRmsAbvGrd + Fireplaces + GarageArea + AgeRemodAdd,
data = House_num,
method = 'anova',
control = rpart.control(minsplit = 60, cp = 0.01)
)
# Display the complexity parameter table for the pruned tree
printcp(tree_model_pruned)
##
## Regression tree:
## rpart(formula = SalePrice ~ LotArea + OverallQual + BsmtFinSF1 +
## TotRmsAbvGrd + Fireplaces + GarageArea + AgeRemodAdd, data = House_num,
## method = "anova", control = rpart.control(minsplit = 60,
## cp = 0.01))
##
## Variables actually used in tree construction:
## [1] BsmtFinSF1 GarageArea OverallQual
##
## Root node error: 6e+12/900 = 6.7e+09
##
## n= 900
##
## CP nsplit rel error xerror xstd
## 1 0.478 0 1.00 1.00 0.090
## 2 0.116 1 0.52 0.53 0.047
## 3 0.058 2 0.41 0.41 0.044
## 4 0.028 3 0.35 0.37 0.034
## 5 0.018 4 0.32 0.35 0.034
## 6 0.014 5 0.30 0.35 0.034
## 7 0.011 6 0.29 0.34 0.033
## 8 0.010 7 0.28 0.33 0.033
plotcp(tree_model_pruned)
library(rattle)
fancyRpartPlot(tree_model_pruned)
Analysis before and after pruning the data
# Before Pruning
predictions_before_pruning_test <- predict(tree_model, newdata = raw_data_test)
mse_before_pruning_test <- mean((predictions_before_pruning_test - raw_data_test$SalePrice)^2)
accuracy_before_pruning_test <- 1 - mse_before_pruning_test/var(raw_data_test$SalePrice)
# After Pruning
predictions_after_pruning_test <- predict(tree_model_pruned, newdata = raw_data_test)
mse_after_pruning_test <- mean((predictions_after_pruning_test - raw_data_test$SalePrice)^2)
accuracy_after_pruning_test <- 1 - mse_after_pruning_test/var(raw_data_test$SalePrice)
# Display Results
cat("Accuracy Before Pruning (Test Data):", round(accuracy_before_pruning_test * 100, 2), "%\n")
## Accuracy Before Pruning (Test Data): 65.65 %
cat("Accuracy After Pruning (Test Data):", round(accuracy_after_pruning_test * 100, 2), "%\n")
## Accuracy After Pruning (Test Data): 62.59 %
Before pruning, your model achieved an accuracy of 65.65% on the test data. This is the performance of the model without any pruning, meaning the tree was allowed to grow without restrictions..
After pruning, the model achieved an accuracy of 62.59% on the test data. Pruning involves removing branches from the tree to prevent overfitting. In this case, it seems that pruning led to a decrease in accuracy. While pruning can help prevent overfitting on the training data, it may result in a slightly less accurate model on the test data..
# Make predictions on the test data
predictions <- predict(tree_model, newdata = raw_data_test, type = "vector")
# Check the structure of predictions
str(predictions)
## Named num [1:90] 125471 125471 202039 202039 125471 ...
## - attr(*, "names")= chr [1:90] "1" "2" "3" "4" ...
# Assuming your target variable is called 'target_variable'
# Create a vector of predicted values
predicted_values <- as.numeric(predictions)
# Calculate evaluation metrics based on your specific task (e.g., Mean Absolute Error for regression)
mae <- mean(abs(predicted_values - raw_data_test$SalePrice))
# Print the evaluation metric
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 27362
# Calculate the R-squared value manually
actual_values <- raw_data_test$SalePrice
residuals <- actual_values - predictions
ss_total <- sum((actual_values - mean(actual_values))^2)
ss_residual <- sum(residuals^2)
r_squared <- 1 - (ss_residual / ss_total)
# Print the R-squared value
print(paste("R-squared value:", round(r_squared, 4)))
## [1] "R-squared value: 0.6527"
# Calculate the adjusted R-squared value
num_predictors <- length(tree_model$variable.importance) # Number of predictors in the model
num_obs <- nrow(raw_data_test) # Number of observations
adjusted_r_squared <- 1 - (1 - r_squared) * ((num_obs - 1) / (num_obs - num_predictors - 1))
# Print the adjusted R-squared value
print(paste("Adjusted R-squared value:", round(adjusted_r_squared, 4)))
## [1] "Adjusted R-squared value: 0.623"
Mean Absolute Error (MAE):
The Tree Model has a higher MAE (27362) compared to the Linear Model (22254).. A lower MAE indicates better model performance, so the Linear Model performs better in terms of MAE..
The Linear Model outperforms the Tree Model in terms of Mean Absolute Error (MAE).. The Linear Model has a relatively high R-squared value, indicating a good fit to the data.. Consider the specific requirements of your task and the interpretability of each model when choosing the best model for your scenario.
Classification Model
library(dplyr)
library(ROCR)
# Loading House_Prices csv data
House_Prices <- read.csv("data/House_Prices.csv", header = TRUE, sep = ",", stringsAsFactors = TRUE)
# Creating a binary variable 'ConvertedOverallQual' based on condition
House_Prices$ConvertedOverallQual <- ifelse(House_Prices$OverallQual < 7, 0,
ifelse(House_Prices$OverallQual >= 7, 1, NA))
# Converting 'ConvertedOverallQual' to a factor
House_Prices$ConvertedOverallQual <- as.factor(House_Prices$ConvertedOverallQual)
# Removing rows with NA in the response variable
House_Prices <- House_Prices[!is.na(House_Prices$ConvertedOverallQual), ]
House_Prices <- House_Prices %>% dplyr::select(-OverallQual)
# Building logistic regression model
class_model <- glm(ConvertedOverallQual ~. , data = House_Prices, family = "binomial")
summary(class_model)
##
## Call:
## glm(formula = ConvertedOverallQual ~ ., family = "binomial",
## data = House_Prices)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.311 -0.347 -0.138 0.191 3.377
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.65e+01 1.81e+02 0.48 0.63222
## LotArea -3.36e-05 9.23e-06 -3.64 0.00027 ***
## YearBuilt 1.07e-02 6.19e-03 1.72 0.08466 .
## YearRemodAdd 1.77e-02 9.26e-03 1.91 0.05556 .
## BsmtFinSF1 -1.91e-03 3.45e-04 -5.54 3.1e-08 ***
## FullBath 3.76e-01 3.31e-01 1.13 0.25680
## HalfBath -1.26e-01 2.59e-01 -0.49 0.62672
## BedroomAbvGr -6.62e-01 2.56e-01 -2.58 0.00979 **
## TotRmsAbvGrd 2.11e-01 1.46e-01 1.45 0.14795
## Fireplaces 1.71e-01 2.08e-01 0.82 0.41145
## GarageArea 1.96e-03 1.03e-03 1.90 0.05679 .
## YrSold -7.53e-02 9.04e-02 -0.83 0.40507
## SalePrice 4.30e-05 5.10e-06 8.43 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1195.32 on 899 degrees of freedom
## Residual deviance: 471.83 on 887 degrees of freedom
## AIC: 497.8
##
## Number of Fisher Scoring iterations: 7
library(readxl)
BA_Predict <- read_excel("data/Predict.xlsx")
# Creating a binary variable 'ConvertedOverallQual' based on condition
BA_Predict$ConvertedOverallQual <- ifelse(BA_Predict$OverallQual < 7, 0,
ifelse(BA_Predict$OverallQual >= 7, 1, NA))
# Converting 'ConvertedOverallQual' to a factor
BA_Predict$ConvertedOverallQual <- as.factor(BA_Predict$ConvertedOverallQual)
# Removing rows with NA in the response variable
BA_Predict <- BA_Predict[!is.na(BA_Predict$ConvertedOverallQual), ]
BA_Predict <- BA_Predict %>% dplyr::select(-OverallQual)
library(caret)
predict_reg=predict(class_model,newdata=BA_Predict, type = "response")
predict_reg=ifelse(predict_reg > 0.5,1,0)
predict_reg = factor(predict_reg, levels = levels(House_Prices$ConvertedOverallQual))
predict_reg
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
## 1 0 1 1 1 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 1 0 0 0
## 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
## 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0 1
## 79 80 81 82 83 84 85 86 87 88 89 90
## 0 0 0 1 1 1 0 0 0 1 1 1
## Levels: 0 1
levels(BA_Predict$ConvertedOverallQual)
## [1] "0" "1"
levels(predict_reg)
## [1] "0" "1"
str(predict_reg)
## Factor w/ 2 levels "0","1": 1 1 2 2 1 1 2 2 1 2 ...
## - attr(*, "names")= chr [1:90] "1" "2" "3" "4" ...
table(predict_reg)
## predict_reg
## 0 1
## 54 36
table(predict_reg,BA_Predict$ConvertedOverallQual)
##
## predict_reg 0 1
## 0 47 7
## 1 8 28
X = confusionMatrix(BA_Predict$ConvertedOverallQual,predict_reg)
X
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 8
## 1 7 28
##
## Accuracy : 0.833
## 95% CI : (0.74, 0.904)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 1.59e-06
##
## Kappa : 0.651
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.870
## Specificity : 0.778
## Pos Pred Value : 0.855
## Neg Pred Value : 0.800
## Prevalence : 0.600
## Detection Rate : 0.522
## Detection Prevalence : 0.611
## Balanced Accuracy : 0.824
##
## 'Positive' Class : 0
##
True Positive (TP): 47 (Actual class: 0, Predicted class: 0). False Positive (FP): 8 (Actual class: 0, Predicted class: 1). False Negative (FN): 7 (Actual class: 1, Predicted class: 0). True Negative (TN): 28 (Actual class: 1, Predicted class: 1).
accuracy <- X$overall["Accuracy"]
precision <- X$byClass["Pos Pred Value"]
accuracy
## Accuracy
## 0.8333
precision
## Pos Pred Value
## 0.8545
Accuracy: 83.33% Accuracy is the proportion of correctly classified instances out of the total number of instances. In this case, the model has an overall accuracy of 83.33%, meaning it correctly predicted the class for approximately 83.33% of the observations..
Positive Predictive Value (Precision): 85.45% Precision, also known as the Positive Predictive Value, measures the proportion of true positive predictions among all positive predictions made by the model. In this case, the positive predictive value is 85.45%. This indicates that when the model predicts the positive class, it is correct about 85.45% of the time..
ROC Curve for the Metrics
predict_reg=predict(class_model,newdata=BA_Predict, type = "response")
predict_reg=ifelse(predict_reg > 0.5,1,0)
pred <- prediction(predict_reg, BA_Predict$ConvertedOverallQual)
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve", col = "blue")
abline(a = 0, b = 1, col = "red")
ROC curve visually represents the trade-off between true positive rate and false positive rate at different probability thresholds. The reference line (diagonal line) represents a random classifier, and the goal is for the ROC curve to be as far away from this line as possible (toward the upper-left corner).. ROC curves for accuracy, sensitivity, and precision, respectively.
auc.perf = performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.8273
acc.perf = performance(pred, measure = "acc")
plot(acc.perf)
rec.perf = performance(pred, measure = "rec")
plot(rec.perf)
prec.perf = performance(pred, measure = "prec")
plot(prec.perf)
Subsequent blocks focus on other important metrics such as accuracy, recall (sensitivity), and precision. These metrics provide insights into different aspects of the model’s performance. These graphs curves can help us choose an appropriate decision threshold for your classification model